{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "import numpy as np \n",
    "import pandas as pd\n",
    "import os,sys,argparse\n",
    "import re\n",
    "\n",
    "def ptm_cdr_wseq(seq,match_pattern):\n",
    "    '''\n",
    "    Count the PTM (post-translational modification) motif sites in a sequence.\n",
    "\n",
    "    Every start position at which `match_pattern` matches is counted, so\n",
    "    overlapping motifs are all reported: 'NNSS' holds two 'N.[ST]' sites\n",
    "    ('NNS' and 'NSS'), whereas re.findall would only report the first.\n",
    "\n",
    "    Typical patterns:\n",
    "      CDRs           -- r'N[GS]|DG'  ('NG', 'NS' or 'DG' inside a CDR)\n",
    "      whole sequence -- r'N.[ST]'    ('N?S' or 'N?T' anywhere in the clone)\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    seq : str\n",
    "        Amino-acid sequence to scan.\n",
    "    match_pattern : str or compiled pattern\n",
    "        Regular expression describing the motif.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int\n",
    "        Number of positions in `seq` at which the motif starts.\n",
    "    '''\n",
    "    motif = re.compile(match_pattern)\n",
    "    # Pattern.match(seq, pos) anchors the attempt at `pos`, so probing every\n",
    "    # offset also counts the overlapping sites a single findall pass skips.\n",
    "    return sum(1 for start in range(len(seq)) if motif.match(seq, start))\n",
    "\n",
    "\n",
    "def cal_ptms(line):\n",
    "    '''\n",
    "    Count the 'NG' / 'NS' / 'DG' motifs in each of the three CDRs of one clone.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    line : pandas.Series or sequence\n",
    "        One clone record whose first three fields are the Cdr1, Cdr2 and Cdr3\n",
    "        sequences, in that order.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Motif counts labelled 'Cdr1ptm', 'Cdr2ptm' and 'Cdr3ptm'.\n",
    "    '''\n",
    "    cdr_pattern = r'N[GS]|DG'\n",
    "    # A row handed over by DataFrame.apply(axis=1) is indexed by column label;\n",
    "    # plain line[0] on such a row relies on the positional fallback that pandas\n",
    "    # has deprecated, so go through .iloc whenever the row provides it.\n",
    "    fields = line.iloc if hasattr(line, 'iloc') else line\n",
    "    return pd.Series({\n",
    "        'Cdr1ptm': ptm_cdr_wseq(fields[0], cdr_pattern),\n",
    "        'Cdr2ptm': ptm_cdr_wseq(fields[1], cdr_pattern),\n",
    "        'Cdr3ptm': ptm_cdr_wseq(fields[2], cdr_pattern),\n",
    "    })\n",
    "\n",
    "\n",
    "def locC(df, n=2):\n",
    "    '''\n",
    "    Locate the positions where cysteine ('C') occurs most often across clones.\n",
    "\n",
    "    A whole clone sequence is supposed to carry two cysteines, so the `n`\n",
    "    columns of `df` with the highest number of 'C' entries are taken as the\n",
    "    reference cysteine positions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        One row per sequence, one column per residue position.\n",
    "    n : int, optional\n",
    "        How many positions to return (default 2).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        Column labels of the `n` most 'C'-rich columns, highest count first;\n",
    "        columns with equal counts keep their left-to-right order.\n",
    "    '''\n",
    "    c_per_position = (df == 'C').sum(axis=0)\n",
    "    # nlargest breaks ties by first occurrence, whereas sort_values with its\n",
    "    # default (unstable) sort could hand back tied columns in either order.\n",
    "    return c_per_position.nlargest(n).index.tolist()\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "def indexC(row,loclist):\n",
    "    '''\n",
    "    Compare the cysteine positions of one sequence with the reference ones.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    row : pandas.Series\n",
    "        Residues of one sequence, indexed by position.\n",
    "    loclist : iterable\n",
    "        Reference cysteine positions (for example the result of locC).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        'Cptm'        -- number of positions that disagree: a reference\n",
    "                         position without 'C', or a 'C' away from the\n",
    "                         reference positions.\n",
    "        'Cptm_locate' -- those positions joined with ',' in ascending order.\n",
    "    '''\n",
    "    observed = set(row[row == 'C'].index.tolist())\n",
    "    # Symmetric difference is exactly (union - intersection).\n",
    "    mismatched = set(loclist) ^ observed\n",
    "    # Sets are unordered, so sort to get a reproducible location string.\n",
    "    try:\n",
    "        ordered = sorted(mismatched)\n",
    "    except TypeError:\n",
    "        # Labels of mixed types cannot be compared; fall back to text order.\n",
    "        ordered = sorted(mismatched, key=str)\n",
    "    return pd.Series({'Cptm': len(mismatched), 'Cptm_locate': ','.join(map(str, ordered))})\n",
    "\n",
    "\n",
    "# Demo clones: three CDR sequences plus a read count per clone.\n",
    "df_merge_clone = pd.DataFrame(\n",
    "    [\n",
    "        ['GFTFSRYS', 'ITSSSRYI', 'AARCPACGDCCVYFYW', '771'],\n",
    "        ['NGNGCCG', 'ITTTTTNGSNGNGNGT', 'AATCITGSNGSECTW', '589'],\n",
    "        ['NGNCCGT', 'ITTTTTNGSNGNGNGT', 'AATCITGWGNGSECTW', '334'],\n",
    "        ['NGNCCNGT', 'INSTTTNGSNGNGNGT', 'AATCICGGSNGSECTW', '234'],\n",
    "    ],\n",
    "    columns=['Cdr1', 'Cdr2', 'Cdr3', 'Counts'],\n",
    ")\n",
    "\n",
    "df_merge_clone['seqlen'] = df_merge_clone['Cdr3'].str.len()\n",
    "\n",
    "# Keep only the clones with the longest Cdr3 so every row has the same number\n",
    "# of positions. .copy() makes the subset an independent frame; adding columns\n",
    "# to a bare boolean-mask slice triggers SettingWithCopyWarning.\n",
    "df_clone_filter = df_merge_clone[df_merge_clone['seqlen'] == df_merge_clone['seqlen'].max()].copy()\n",
    "\n",
    "# One column per Cdr3 residue, labelled 1..N. str.split('') gives the same\n",
    "# labels only as a side effect of splitting on an empty pattern, and pads the\n",
    "# frame with an empty column at each end.\n",
    "df = pd.DataFrame([[*cdr3] for cdr3 in df_clone_filter['Cdr3']], index=df_clone_filter.index)\n",
    "df.columns = df.columns + 1\n",
    "loclist = locC(df)\n",
    "\n",
    "# Cysteine mismatches and per-CDR motif counts share the clone index, so they\n",
    "# can be appended side by side.\n",
    "cys_ptms = df.apply(indexC, loclist=loclist, axis=1)\n",
    "cdr_ptms = df_clone_filter.apply(cal_ptms, axis=1)\n",
    "df_clone_filter = pd.concat([df_clone_filter, cys_ptms, cdr_ptms], axis=1)\n",
    "\n",
    "df_clone_filter['Allptms'] = (\n",
    "    df_clone_filter['Cptm']\n",
    "    + df_clone_filter['Cdr1ptm']\n",
    "    + df_clone_filter['Cdr2ptm']\n",
    "    + df_clone_filter['Cdr3ptm']\n",
    ")\n",
    "df_clone_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [4, 10,11,14,7]\n",
       "2              [0, ]\n",
       "3             [1, 6]\n",
       "dtype: object"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-clone cysteine mismatches against the reference positions in loclist.\n",
    "df.apply(indexC, axis=1, loclist=loclist)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-CDR motif counts, written into their three result columns.\n",
    "ptm_columns = ['Cdr1ptm', 'Cdr2ptm', 'Cdr3ptm']\n",
    "df_clone_filter[ptm_columns] = df_clone_filter.apply(cal_ptms, axis=1)\n",
    "\n",
    "# Total liabilities per clone: cysteine mismatches plus the CDR motif counts.\n",
    "df_clone_filter['Allptms'] = (\n",
    "    df_clone_filter['Cptm']\n",
    "    + df_clone_filter['Cdr1ptm']\n",
    "    + df_clone_filter['Cdr2ptm']\n",
    "    + df_clone_filter['Cdr3ptm']\n",
    ")\n",
    "df_clone_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cdr1</th>\n",
       "      <th>Cdr2</th>\n",
       "      <th>Cdr3</th>\n",
       "      <th>Counts</th>\n",
       "      <th>seqlen</th>\n",
       "      <th>Cdr1ptm</th>\n",
       "      <th>Cdr2ptm</th>\n",
       "      <th>Cdr3ptm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>GFTFSRYS</td>\n",
       "      <td>ITSSSRYI</td>\n",
       "      <td>AARCPACGDCCVYFYW</td>\n",
       "      <td>771</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NGNCCGT</td>\n",
       "      <td>ITTTTTNGSNGNGNGT</td>\n",
       "      <td>AATCITGWGNGSECTW</td>\n",
       "      <td>334</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NGNCCNGT</td>\n",
       "      <td>INSTTTNGSNGNGNGT</td>\n",
       "      <td>AATCICGGSNGSECTW</td>\n",
       "      <td>234</td>\n",
       "      <td>16</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Cdr1              Cdr2              Cdr3 Counts  seqlen  Cdr1ptm  \\\n",
       "0  GFTFSRYS          ITSSSRYI  AARCPACGDCCVYFYW    771      16        0   \n",
       "2   NGNCCGT  ITTTTTNGSNGNGNGT  AATCITGWGNGSECTW    334      16        1   \n",
       "3  NGNCCNGT  INSTTTNGSNGNGNGT  AATCICGGSNGSECTW    234      16        2   \n",
       "\n",
       "   Cdr2ptm  Cdr3ptm  \n",
       "0        0        0  \n",
       "2        4        1  \n",
       "3        5        1  "
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-CDR motif counts for every retained clone.\n",
    "cdr_ptms = df_clone_filter.apply(cal_ptms, axis=1)\n",
    "# Drop any Cdr*ptm columns left over from an earlier cell first; merging onto\n",
    "# them would produce duplicated '_x' / '_y' columns.\n",
    "pd.merge(\n",
    "    df_clone_filter.drop(columns=cdr_ptms.columns.tolist(), errors='ignore'),\n",
    "    cdr_ptms,\n",
    "    left_index=True,\n",
    "    right_index=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[14, 4]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same lookup as locC, done on the transposed frame (one row per position).\n",
    "# A separate name is used so the shared `df` (one row per clone) that the\n",
    "# df.apply(indexC, ...) cells rely on is not replaced.\n",
    "cdr3_by_position = df_clone_filter['Cdr3'].str.split('', expand=True).T\n",
    "c_per_position = (cdr3_by_position == 'C').sum(axis=1)\n",
    "c_per_position.sort_values(ascending=False).iloc[:2].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[4, 14]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df_clone_filter['Cdr3'].str.split('', expand=True)\n",
    "# Not named `list`: rebinding the builtin makes every later list(...) call in\n",
    "# this kernel fail with TypeError ('Series' object is not callable).\n",
    "c_per_position = (df == 'C').sum(axis=0)\n",
    "c_per_position.sort_values(ascending=False).iloc[:2].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([4, 7, 10, 11], [4, 14])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cysteine positions of the clone labelled 0, shown next to the reference positions.\n",
    "first_clone = df.loc[0]\n",
    "clist = first_clone[first_clone == 'C'].index.tolist()\n",
    "clist, loclist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    (4, 10,11,14,7)\n",
       "2              (0, )\n",
       "3             (1, 6)\n",
       "dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cysteine mismatches of each clone against the fixed positions 4 and 14.\n",
    "df.apply(indexC, axis=1, loclist=[4, 14])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'Series' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-18-af68207b91fc>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_clone_filter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Cdr3'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\series.py\u001b[0m in \u001b[0;36mmap\u001b[1;34m(self, arg, na_action)\u001b[0m\n\u001b[0;32m   2996\u001b[0m         \"\"\"\n\u001b[0;32m   2997\u001b[0m         new_values = super(Series, self)._map_values(\n\u001b[1;32m-> 2998\u001b[1;33m             arg, na_action=na_action)\n\u001b[0m\u001b[0;32m   2999\u001b[0m         return self._constructor(new_values,\n\u001b[0;32m   3000\u001b[0m                                  index=self.index).__finalize__(self)\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\base.py\u001b[0m in \u001b[0;36m_map_values\u001b[1;34m(self, mapper, na_action)\u001b[0m\n\u001b[0;32m   1002\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1003\u001b[0m         \u001b[1;31m# mapper is a function\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1004\u001b[1;33m         \u001b[0mnew_values\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmap_f\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmapper\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1005\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1006\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mnew_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas/_libs/src\\inference.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32m<ipython-input-18-af68207b91fc>\u001b[0m in \u001b[0;36m<lambda>\u001b[1;34m(x)\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_clone_filter\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Cdr3'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: 'Series' object is not callable"
     ]
    }
   ],
   "source": [
    "# Split each Cdr3 into its residues. Unpacking avoids calling list(), which\n",
    "# raises TypeError here whenever an earlier cell has rebound the name `list`.\n",
    "df = df_clone_filter['Cdr3'].map(lambda cdr3: [*cdr3])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "      <th>16</th>\n",
       "      <th>17</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>R</td>\n",
       "      <td>C</td>\n",
       "      <td>P</td>\n",
       "      <td>A</td>\n",
       "      <td>C</td>\n",
       "      <td>G</td>\n",
       "      <td>D</td>\n",
       "      <td>C</td>\n",
       "      <td>C</td>\n",
       "      <td>V</td>\n",
       "      <td>Y</td>\n",
       "      <td>F</td>\n",
       "      <td>Y</td>\n",
       "      <td>W</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td></td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>I</td>\n",
       "      <td>T</td>\n",
       "      <td>G</td>\n",
       "      <td>W</td>\n",
       "      <td>G</td>\n",
       "      <td>N</td>\n",
       "      <td>G</td>\n",
       "      <td>S</td>\n",
       "      <td>E</td>\n",
       "      <td>C</td>\n",
       "      <td>T</td>\n",
       "      <td>W</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>I</td>\n",
       "      <td>C</td>\n",
       "      <td>G</td>\n",
       "      <td>G</td>\n",
       "      <td>S</td>\n",
       "      <td>N</td>\n",
       "      <td>G</td>\n",
       "      <td>S</td>\n",
       "      <td>E</td>\n",
       "      <td>C</td>\n",
       "      <td>T</td>\n",
       "      <td>W</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17\n",
       "0     A  A  R  C  P  A  C  G  D  C  C  V  Y  F  Y  W   \n",
       "2     A  A  T  C  I  T  G  W  G  N  G  S  E  C  T  W   \n",
       "3     A  A  T  C  I  C  G  G  S  N  G  S  E  C  T  W   "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.23.4'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
